library(readr)
# Read the project CSV into `dataset` (a tibble, since read_csv() is used).
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs unchanged where the file exists at exactly this location.
dataset <- read_csv("C:/Users/User/OneDrive/Desktop/R folder/R project/data_with_derived_vars.csv")
#View(dataset)
# Print per-column summary statistics of the raw data
summary(dataset)
# Load dplyr
library(dplyr)
library(caret)
library(pROC)

# Report the class of every column in the raw data
data_types <- dataset %>% sapply(class)
data_types

# Count the missing values in each column of the raw data
missing_values <- dataset %>%
  is.na() %>%
  colSums()
missing_values

# Convert coded numeric columns to categorical factors.
# Any value outside the listed levels becomes NA, and the affected row is
# then removed by na.omit() below.
# NOTE(review): confirm the raw data contains no other valid codes for
# EDUCATION, MARRIAGE or the PAY_* columns; such rows are silently dropped.
dataset$SEX <- factor(dataset$SEX, levels = c(1, 2))
dataset$EDUCATION <- factor(dataset$EDUCATION, levels = 1:4)
dataset$MARRIAGE <- factor(dataset$MARRIAGE, levels = 1:3)
pay_status_cols <- c("PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6")
dataset[pay_status_cols] <- lapply(
  dataset[pay_status_cols],
  factor,
  levels = -1:9
)
dataset$Y <- factor(dataset$Y, levels = c(0, 1))

# Fill missing values in the ratio features with the column median.
# This has to happen BEFORE na.omit(): once incomplete rows are removed there
# is nothing left to impute and the rows meant to be filled are lost.
ratio_cols <- c(
  paste0("ratio_payamt", 1:6),
  "max_ratio_last3month", "max_ratio_last6month",
  "min_ratio_last3month", "min_ratio_last6month"
)
data <- dataset
for (ratio_col in ratio_cols) {
  missing_rows <- is.na(data[[ratio_col]])
  data[[ratio_col]][missing_rows] <- median(data[[ratio_col]], na.rm = TRUE)
}

# Drop the rows that still contain a missing value in any other column
data <- na.omit(data)

# Report the class of every column in the cleaned data
data_types <- data %>% sapply(class)
data_types

# Count the missing values in each column of the cleaned data
missing_values <- data %>%
  is.na() %>%
  colSums()
missing_values
# Outliers ----
# The section separator must be a comment: a bare line of dashes is parsed as
# a chain of unary minus operators applied to the next expression, which
# fails at run time.
summary(data)

# Pair plot: scatter-plot matrix of LIMIT_BAL, AGE and the six BILL_AMT columns
pairs(
  ~ LIMIT_BAL + AGE + BILL_AMT1 + BILL_AMT2 + BILL_AMT3 +
    BILL_AMT4 + BILL_AMT5 + BILL_AMT6,
  data = data,
  main = "Simple Scatterplot Matrix"
)
  


# Work on the numeric columns only; factor columns are left untouched
numeric_data <- data[vapply(data, is.numeric, logical(1))]

# Per-column record of the capping limits, filled in by the loop below
boundaries <- list()

# Cap every numeric column at the 1.5 * IQR fences
for (col in names(numeric_data)) {
  column_values <- numeric_data[[col]]

  # First and third quartile, and the distance between them
  quartiles <- quantile(column_values, c(0.25, 0.75), na.rm = TRUE)
  spread <- quartiles[2] - quartiles[1]

  # Fences below / above which a value is treated as an outlier
  lower_bound <- quartiles[1] - 1.5 * spread
  upper_bound <- quartiles[2] + 1.5 * spread

  # Report the fences for this column
  cat("Column:", col, "\n")
  cat("  Lower Bound:", lower_bound, "\n")
  cat("  Upper Bound:", upper_bound, "\n\n")

  # Remember the fences for the summary printed after the loop
  boundaries[[col]] <- list(lower_bound = lower_bound, upper_bound = upper_bound)

  # Pull values outside the fences back onto the nearest fence
  column_values[column_values < lower_bound] <- lower_bound
  column_values[column_values > upper_bound] <- upper_bound
  numeric_data[[col]] <- column_values
}

# Write the capped columns back into the full data set
data[names(numeric_data)] <- numeric_data


head(data)

# Show every stored pair of fences
cat("\nSummary of Boundaries for All Columns:\n")
print(boundaries)

summary(data)
  
# Significance testing ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks the
# assignment that follows.

# Drop three delay features (the original script describes them as zero
# standard deviation columns), then keep the remaining numeric columns.
df_numaric <- data %>%
  select(-cnt_delay_last3month, -avg_delay_last3month, -ratio_delay_trending)
summary(df_numaric)
numeric_columns <- df_numaric[sapply(df_numaric, is.numeric)]

# Attach the response so the numeric predictors can be tested against it
df_numaric_Y <- numeric_columns %>% mutate(Y = data$Y)
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(df_numaric_Y)
}

# Logistic regression of Y on all numeric predictors
significance_test_numeric <- glm(
  Y ~ .,
  data = df_numaric_Y,
  family = binomial()
)
summary(significance_test_numeric)


# Significance of the categorical variables:
# logistic regression of Y on the factor predictors only
significance_test_categorical <- glm(
  Y ~ SEX + EDUCATION + MARRIAGE + PAY_0 + PAY_2 + PAY_3 +
    PAY_4 + PAY_5 + PAY_6,
  data = data,
  family = binomial()
)
summary(significance_test_categorical)

# Heat map & multicollinearity ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks it.

# Numeric predictors only (df_numaric still contains the factor columns)
numeric_columns <- df_numaric[sapply(df_numaric, is.numeric)]

# Pairwise correlation matrix of the numeric columns
cor_matrix <- cor(numeric_columns)
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(cor_matrix)
}


heatmap(
  cor_matrix,
  main = "Correlation Heatmap",
  symm = TRUE,
  col = colorRampPalette(c("blue", "white", "red"))(10)
)

# Column indices suggested for removal at a correlation cutoff of 0.9
high_corr <- findCorrelation(cor_matrix, cutoff = 0.9)
print(names(numeric_columns)[high_corr])

# Select the columns to KEEP instead of using a negative index: with an empty
# `high_corr`, `numeric_columns[, -high_corr]` would drop every column.
keep_cols <- setdiff(seq_along(numeric_columns), high_corr)
data_numeric_cleaned <- numeric_columns[keep_cols]
if (interactive()) {
  View(data_numeric_cleaned)
}
# Reduce multicollinearity with PCA ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks it.

numeric_columns <- df_numaric[sapply(df_numaric, is.numeric)]

# Principal components of the standardised numeric predictors.
# The argument is spelled `scale.` in full rather than relying on partial
# argument matching of `scale`.
pca <- prcomp(numeric_columns, scale. = TRUE)

# Keep the first five components (or all of them if there are fewer)
n_components <- min(5, ncol(pca$x))
pca_result <- pca$x[, seq_len(n_components), drop = FALSE]
df_pca <- as.data.frame(pca_result) %>% mutate(Y = data$Y)

# Logistic regression of Y on the retained components
logit.mod <- glm(Y ~ ., data = df_pca, family = binomial)
summary(logit.mod)

# Variable selection: PCA components plus categorical predictors ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks it.

numeric_columns <- df_numaric[sapply(df_numaric, is.numeric)]

# Principal components of the standardised numeric predictors
# (`scale.` spelled in full instead of relying on partial matching)
pca <- prcomp(numeric_columns, scale. = TRUE)

# Keep the first 20 components (or all of them if there are fewer)
n_components <- min(20, ncol(pca$x))
pca_result <- pca$x[, seq_len(n_components), drop = FALSE]

# Combine the components with the response and the categorical predictors
df_pca <- as.data.frame(pca_result) %>%
  mutate(
    Y = data$Y,
    SEX = data$SEX,
    EDUCATION = data$EDUCATION,
    MARRIAGE = data$MARRIAGE,
    PAY_0 = data$PAY_0,
    PAY_2 = data$PAY_2,
    PAY_3 = data$PAY_3,
    PAY_4 = data$PAY_4,
    PAY_5 = data$PAY_5,
    PAY_6 = data$PAY_6
  )
#View(df_pca)

logit.pca <- glm(Y ~ ., data = df_pca, family = binomial)

# Predicted probabilities on the same rows the model was fitted on
predicted_probs <- predict(logit.pca, type = "response")

# ROC curve and AUC (in-sample, since no new data is supplied)
roc_curve_pca <- roc(df_pca$Y, predicted_probs)
auc_value_pca <- auc(roc_curve_pca)

print(auc_value_pca)

# Logistic regression on all columns of `data` ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks the
# assignment that follows.
logit.mod <- glm(Y ~ ., data = data, family = binomial)

# Predicted probabilities on the same rows the model was fitted on
predicted_probs <- predict(logit.mod, type = "response")

# ROC curve and AUC (in-sample, since no new data is supplied)
roc_curve <- roc(data$Y, predicted_probs)
auc_value <- auc(roc_curve)

print(auc_value)
# Forward selection ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators applied to the next expression (here the library()
# call) and fails at run time.

library(leaps)

# Forward selection on the PCA-based data set
# NOTE(review): regsubsets() does subset selection for linear models while Y
# is a factor here - confirm this is the intended use.
forward_model <- regsubsets(Y ~ ., data = df_pca, method = "forward")
# NOTE(review): the name says 5 variables but the size-8 model is extracted.
best_5_vars <- coef(forward_model, 8)
best_5_vars

# Numeric columns left after the correlation filter, plus the categorical
# predictors and the response
data_without_high_corr <- data_numeric_cleaned %>%
  mutate(
    SEX = data$SEX,
    EDUCATION = data$EDUCATION,
    MARRIAGE = data$MARRIAGE,
    PAY_0 = data$PAY_0,
    PAY_2 = data$PAY_2,
    PAY_3 = data$PAY_3,
    PAY_4 = data$PAY_4,
    PAY_5 = data$PAY_5,
    PAY_6 = data$PAY_6,
    Y = data$Y
  )
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(data_without_high_corr)
}

# Forward selection on the correlation-filtered data set; coefficients of the
# size-7 model
forward_model <- regsubsets(
  Y ~ .,
  data = data_without_high_corr,
  method = "forward"
)
best_7_vars <- coef(forward_model, 7)
best_7_vars

# Logistic regression on a hand-picked set of predictors
# NOTE(review): these columns must have survived the correlation filter,
# otherwise glm() cannot find them in `data_without_high_corr`.
logit.forward_mod <- glm(
  Y ~ LIMIT_BAL + max_delay_last3month + max_delay_last6month +
    max_ratio_last6month + PAY_0 + PAY_5,
  data = data_without_high_corr,
  family = binomial
)

# Predicted probabilities on the same rows the model was fitted on
predicted_probs <- predict(logit.forward_mod, type = "response")

# ROC curve and AUC (in-sample, since no new data is supplied)
roc_curve_forward <- roc(data$Y, predicted_probs)
auc_value_forward <- auc(roc_curve_forward)

print(auc_value_forward)
# Logistic regression on the correlation-filtered data set ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks the
# assignment that follows.
logit.mod <- glm(Y ~ ., data = data_without_high_corr, family = binomial)

# Predicted probabilities on the same rows the model was fitted on
predicted_probs <- predict(logit.mod, type = "response")

# ROC curve and AUC (in-sample, since no new data is supplied)
roc_curve <- roc(data$Y, predicted_probs)
auc_value <- auc(roc_curve)

print(auc_value)
# Univariate analysis: class imbalance ----
# The section separator must be a comment: a bare line of dashes is parsed as
# unary minus operators in front of the next statement and breaks the
# assignment that follows.

# Frequency of each level of Y as a data frame (columns Var1 and Freq)
imbalance_class <- as.data.frame(table(data$Y))

# Load ggplot2 library
library(ggplot2)

# Bar chart of the class counts
ggplot(imbalance_class) +
  geom_col(aes(x = Var1, y = Freq), fill = "blue", color = "black") +
  ggtitle("Frequency Distribution of Y") +
  labs(x = "Class", y = "Count")

  
# Education vs Defaulter

# Number of rows for every EDUCATION / Y combination.
# count() returns an ungrouped result, so no grouping is left on the data
# and no regrouping message is emitted.
grouped_data <- data %>%
  count(EDUCATION, Y, name = "Count")

# Clustered bar chart: one bar per level of Y within each education level
ggplot(grouped_data) +
  geom_col(aes(x = EDUCATION, y = Count, fill = Y), position = "dodge") +
  labs(
    title = "Clustered Bar Diagram of Y by EDUCATION",
    x = "Education",
    y = "Count",
    fill = "Y"
  )
  
# Sex vs Defaulter

# Number of rows for every SEX / Y combination (ungrouped result)
grouped_data <- data %>%
  count(SEX, Y, name = "Count")

# Clustered bar chart: one bar per level of Y within each level of SEX.
# Title and x-axis label name SEX; they previously said EDUCATION, a
# copy-paste leftover.
ggplot(grouped_data) +
  geom_col(aes(x = SEX, y = Count, fill = Y), position = "dodge") +
  labs(
    title = "Clustered Bar Diagram of Y by SEX",
    x = "Sex",
    y = "Count",
    fill = "Y"
  )
  
# Age group vs Defaulter

# Breakpoints and labels for binning AGE. With right = FALSE the bins are
# [20, 40), [40, 60) and [60, 80); ages outside that range become NA.
breaks <- c(20, 40, 60, 80)
labels <- c("Young", "Middle", "Old")
#data$AGE <- as.numeric(data$AGE)

# Add an `Age_group` column by binning AGE (the column is referenced
# directly inside mutate() rather than through data$AGE)
Age_group_data <- data %>%
  mutate(Age_group = cut(AGE, breaks = breaks, labels = labels, right = FALSE))

#data$AGE <- cut(data$AGE, breaks = breaks, right = FALSE)
# View() opens a data viewer, so only call it in an interactive session
if (interactive()) {
  View(Age_group_data)
}

# Number of rows for every Age_group / Y combination (ungrouped result)
grouped_data <- Age_group_data %>%
  count(Age_group, Y, name = "Count")

# Clustered bar chart: one bar per level of Y within each age group
ggplot(grouped_data) +
  geom_col(aes(x = Age_group, y = Count, fill = Y), position = "dodge") +
  labs(
    title = "Clustered Bar Diagram of Y by Age group",
    x = "Age Group",
    y = "Count",
    fill = "Y"
  )
  

  
  
  
  
  
  
  
  
    
  
  